Resampling Methods

Module 08

Ray J. Hoobler

Libraries

Code
library(tidyverse)
library(broom)
library(patchwork)

Cross-Validation

The Validation Set Approach

Training Set (50%) Observations: 7, 22, 13, ... Validation Set (50%) Observations: 91, ... Random 50% Split

The Validation Set Approach

Auto MPG dataset (Quinlan 1993)

  • 398 observations
  • 9 variables
  • Predict mpg using horsepower
Code
# Column names for the UCI Auto MPG data; the raw file is whitespace-delimited
# for the 8 numeric fields, with the car name tab-separated at the end
auto_mpg_column_names <- c("mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "model_year", "origin", "car_name")

# First pass: whitespace-delimited read for the 8 numeric columns;
# "?" marks missing horsepower values in the raw file (parsed as NA)
auto_mpg_spaces <- read_table("datasets/auto+mpg/auto-mpg.data", na = "?", col_names = auto_mpg_column_names[1:8]) 

# Second pass: re-read the same file split on tabs and keep only column 2,
# which holds the quoted car name
auto_mpg_tab <- read_delim("datasets/auto+mpg/auto-mpg.data", delim = "\t", col_names = FALSE) |> select(2)

# auto_mpg_spaces
# auto_mpg_tab

# Combine the two reads column-wise into the full 9-column data set
auto_mpg_9 <- bind_cols(auto_mpg_spaces, auto_mpg_tab) |> 
  set_names(auto_mpg_column_names)

summary(auto_mpg_9)
      mpg          cylinders      displacement     horsepower        weight    
 Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
 1st Qu.:17.50   1st Qu.:4.000   1st Qu.:104.2   1st Qu.: 75.0   1st Qu.:2224  
 Median :23.00   Median :4.000   Median :148.5   Median : 93.5   Median :2804  
 Mean   :23.51   Mean   :5.455   Mean   :193.4   Mean   :104.5   Mean   :2970  
 3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:262.0   3rd Qu.:126.0   3rd Qu.:3608  
 Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
                                                 NA's   :6                     
  acceleration     model_year        origin        car_name        
 Min.   : 8.00   Min.   :70.00   Min.   :1.000   Length:398        
 1st Qu.:13.82   1st Qu.:73.00   1st Qu.:1.000   Class :character  
 Median :15.50   Median :76.00   Median :1.000   Mode  :character  
 Mean   :15.57   Mean   :76.01   Mean   :1.573                     
 3rd Qu.:17.18   3rd Qu.:79.00   3rd Qu.:2.000                     
 Max.   :24.80   Max.   :82.00   Max.   :3.000                     
                                                                   
Code
# Remove the 6 observations where horsepower is missing ("?" in the raw file)
auto_mpg_9_final <- auto_mpg_9 |> 
  filter(!is.na(horsepower))

summary(auto_mpg_9_final)
      mpg          cylinders      displacement     horsepower        weight    
 Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
 1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
 Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
 Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
 3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
 Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
  acceleration     model_year        origin        car_name        
 Min.   : 8.00   Min.   :70.00   Min.   :1.000   Length:392        
 1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   Class :character  
 Median :15.50   Median :76.00   Median :1.000   Mode  :character  
 Mean   :15.54   Mean   :75.98   Mean   :1.577                     
 3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000                     
 Max.   :24.80   Max.   :82.00   Max.   :3.000                     

Plot of the data

Code
# Scatterplot of mpg vs. horsepower for the cleaned data.
# A single labs() call: the original chunk had two, and the second one
# silently replaced every label set by the first.
auto_mpg_9_final |> 
  ggplot(aes(x = horsepower, y = mpg)) +
  geom_point() +
  labs(
    title = "Exploratory Data Analysis (EDA) of Auto MPG dataset",
    subtitle = "Understanding the relationship between Horsepower and MPG",
    x = "Horsepower",
    y = "MPG",
    caption = "Source: Auto MPG dataset from UCI Machine Learning Repository"
  ) +
  theme_light()

Example: Validation Set Approach (1/)

Random 50% test train split of the data

Code
set.seed(42)
# Create a random 50% split of the data index using the response variable mpg
# createDataPartition() stratifies on mpg so both halves get similar response
# distributions; list = FALSE returns a one-column matrix of training row indices

auto_mpg_train <- caret::createDataPartition(auto_mpg_9_final$mpg, p = 0.5, list = FALSE)
head(auto_mpg_train, 10)
      Resample1
 [1,]         1
 [2,]         3
 [3,]         4
 [4,]         5
 [5,]         6
 [6,]         7
 [7,]         8
 [8,]        10
 [9,]        12
[10,]        18

Example: Validation Set Approach (2/)

Random 50% test train split of the data

Code
# Row-index the training rows; negative indexing keeps the complement
# as the validation (test) set
auto_mpg_train_data <- auto_mpg_9_final[auto_mpg_train,]
auto_mpg_test_data <- auto_mpg_9_final[-auto_mpg_train,]

auto_mpg_train_data
Code
auto_mpg_test_data

Example: Validation Set Approach (3/)

Fit a linear model on the training data

Code
# Simple linear regression of mpg on horsepower, fit on the training half only
auto_mpg_lm <- lm(mpg ~ horsepower, data = auto_mpg_train_data)
summary(auto_mpg_lm)

Call:
lm(formula = mpg ~ horsepower, data = auto_mpg_train_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-13.5314  -3.4708  -0.0281   2.5311  12.2426 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 40.144797   0.949810   42.27   <2e-16 ***
horsepower  -0.161297   0.008591  -18.77   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.602 on 196 degrees of freedom
Multiple R-squared:  0.6427,    Adjusted R-squared:  0.6408 
F-statistic: 352.5 on 1 and 196 DF,  p-value: < 2.2e-16

Example: Validation Set Approach (4/)

Predict the test data

Code
# Predict mpg for the held-out validation rows using the training-set fit
auto_mpg_test_pred <- predict(auto_mpg_lm, newdata = auto_mpg_test_data)
auto_mpg_test_pred
        1         2         3         4         5         6         7         8 
13.530841  3.853039 12.724358 15.950292  3.853039 24.821610 24.821610 24.499017 
        9        10        11        12        13        14        15        16 
25.950687 32.725149 26.111984 21.918270 25.628094  5.466006  7.885456  9.014533 
       17        18        19        20        21        22        23        24 
25.950687 25.628094 24.015127 23.208643 25.950687 13.530841 11.111390 22.402160 
       25        26        27        28        29        30        31        32 
28.531435 24.015127 26.273281 29.015325 30.466995 27.241061 31.434775 26.273281 
       33        34        35        36        37        38        39        40 
15.950292 15.466402  6.595083 15.143808 14.337325 15.950292 26.111984 29.015325 
       41        42        43        44        45        46        47        48 
26.273281 25.305501 24.499017 27.241061 25.950687 11.917874  8.208050 15.950292 
       49        50        51        52        53        54        55        56 
14.659918  5.466006 24.015127 24.015127 25.950687 24.821610 11.111390 24.982907 
       57        58        59        60        61        62        63        64 
25.628094 22.886050 16.756775  3.046555 25.466797 24.821610 24.015127 29.337918 
       65        66        67        68        69        70        71        72 
27.241061 28.047544 22.402160 23.208643 17.563259 15.950292 15.950292 28.047544 
       73        74        75        76        77        78        79        80 
24.821610 28.531435 12.724358 15.950292 16.272885 23.208643 22.402160 24.821610 
       81        82        83        84        85        86        87        88 
22.402160 19.337522 28.047544 26.757171 24.499017 25.628094 24.821610 24.337720 
       89        90        91        92        93        94        95        96 
21.595676 27.079764 25.305501 27.402358 26.757171 17.563259 15.950292 24.015127 
       97        98        99       100       101       102       103       104 
23.208643 27.079764 31.757369 31.596072 24.015127 22.402160 28.047544 15.950292 
      105       106       107       108       109       110       111       112 
16.756775 19.176226 29.176621 16.756775 23.208643 24.337720 11.111390 12.724358 
      113       114       115       116       117       118       119       120 
16.111588 25.789391 26.757171 29.337918 27.563654 24.499017 22.402160 22.402160 
      121       122       123       124       125       126       127       128 
32.402556 29.499215 31.757369 30.466995 22.402160 17.563259 25.628094 26.434577 
      129       130       131       132       133       134       135       136 
22.402160 16.756775 17.724555 17.563259 24.499017 24.821610 26.434577 24.499017 
      137       138       139       140       141       142       143       144 
21.595676 28.692731 26.434577 25.628094 19.176226 17.885852 18.369742 15.143808 
      145       146       147       148       149       150       151       152 
29.660512 27.724951 28.692731 25.628094 28.854028 28.854028 29.660512 25.628094 
      153       154       155       156       157       158       159       160 
21.595676 21.595676 25.628094 27.886248 30.466995 29.660512 25.628094 25.628094 
      161       162       163       164       165       166       167       168 
28.047544 25.305501 29.660512 32.402556 29.337918 29.337918 18.853632 24.015127 
      169       170       171       172       173       174       175       176 
25.950687 26.595874 22.402160 29.337918 30.144402 29.176621 29.983105 29.660512 
      177       178       179       180       181       182       183       184 
29.660512 28.047544 24.015127 27.886248 21.434380 20.789193 26.434577 25.950687 
      185       186       187       188       189       190       191       192 
26.434577 28.208841 29.176621 28.854028 29.337918 25.305501 24.660314 26.595874 
      193       194 
25.628094 26.918468 

Example: Validation Set Approach (5/)

Calculate the MSE

Code
# Validation-set estimate of the test MSE for the linear model
lm_errors <- auto_mpg_test_data$mpg - auto_mpg_test_pred
mpg_mse_lm <- mean(lm_errors^2)
mpg_mse_lm
[1] 27.06624

Example: Validation Set Approach (6/)

Repeat the process for a quadratic model

Code
# Quadratic fit: I() protects horsepower^2 from formula-operator interpretation
auto_mpg_lm_quad <- lm(mpg ~ horsepower + I(horsepower^2), data = auto_mpg_train_data)
summary(auto_mpg_lm_quad)

Call:
lm(formula = mpg ~ horsepower + I(horsepower^2), data = auto_mpg_train_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-14.5789  -2.2724  -0.0706   2.2250  12.0560 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)     57.8265703  2.3769451  24.328  < 2e-16 ***
horsepower      -0.4878086  0.0418201 -11.664  < 2e-16 ***
I(horsepower^2)  0.0013261  0.0001671   7.936  1.6e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.011 on 195 degrees of freedom
Multiple R-squared:  0.7299,    Adjusted R-squared:  0.7271 
F-statistic: 263.5 on 2 and 195 DF,  p-value: < 2.2e-16

Example: Validation Set Approach (7/)

Predict the test data of the quadratic model

Code
# Validation-set predictions from the quadratic model
auto_mpg_test_pred_quad <- predict(auto_mpg_lm_quad, newdata = auto_mpg_test_data)
auto_mpg_test_pred_quad
       1        2        3        4        5        6        7        8 
13.44171 15.20435 13.22392 14.49293 15.20435 23.45297 23.45297 22.98658 
       9       10       11       12       13       14       15       16 
25.16887 38.19344 25.42461 19.63740 24.66535 14.24752 13.30957 13.07608 
      17       18       19       20       21       22       23       24 
25.16887 24.66535 22.30689 21.22712 25.16887 13.44171 12.98724 20.21365 
      25       26       27       28       29       30       31       32 
29.57895 22.30689 25.68300 30.48143 33.33208 27.28904 35.35187 25.68300 
      33       34       35       36       37       38       39       40 
14.49293 14.23495 13.73554 14.07622 13.72581 14.49293 25.42461 30.48143 
      41       42       43       44       45       46       47       48 
25.68300 24.17244 22.98658 27.28904 25.16887 13.07243 13.22959 14.49293 
      49       50       51       52       53       54       55       56 
13.85802 14.24752 22.30689 22.30689 25.16887 23.45297 12.98724 23.69014 
      57       58       59       60       61       62       63       64 
24.66535 20.81378 14.97595 15.78223 24.41757 23.45297 22.30689 31.09634 
      65       66       67       68       69       70       71       72 
27.28904 28.70034 20.21365 21.22712 15.52528 14.49293 14.49293 28.70034 
      73       74       75       76       77       78       79       80 
23.45297 29.57895 13.22392 14.49293 14.67818 21.22712 20.21365 23.45297 
      81       82       83       84       85       86       87       88 
20.21365 16.96719 28.70034 26.47408 22.98658 24.66535 23.45297 22.75737 
      89       90       91       92       93       94       95       96 
19.26649 27.01474 24.17244 27.56599 26.47408 15.52528 14.49293 22.30689 
      97       98       99      100      101      102      103      104 
21.22712 27.01474 36.04635 35.69778 22.30689 20.21365 28.70034 14.49293 
     105      106      107      108      109      110      111      112 
14.97595 16.82285 30.78756 14.97595 21.22712 22.75737 12.98724 13.22392 
     113      114      115      116      117      118      119      120 
14.58423 24.91579 26.47408 31.09634 27.84560 22.98658 20.21365 20.21365 
     121      122      123      124      125      126      127      128 
37.46713 31.40777 36.04635 33.33208 20.21365 15.52528 24.66535 25.94404 
     129      130      131      132      133      134      135      136 
20.21365 14.97595 15.64310 15.52528 22.98658 23.45297 25.94404 22.98658 
     137      138      139      140      141      142      143      144 
19.26649 29.87712 25.94404 24.66535 16.82285 15.76357 16.14091 14.07622 
     145      146      147      148      149      150      151      152 
31.72186 28.12786 29.87712 24.66535 30.17795 30.17795 31.72186 24.66535 
     153      154      155      156      157      158      159      160 
19.26649 19.26649 24.66535 28.41278 33.33208 31.72186 24.66535 24.66535 
     161      162      163      164      165      166      167      168 
28.70034 24.17244 31.72186 37.46713 31.09634 31.09634 16.54211 22.30689 
     169      170      171      172      173      174      175      176 
25.16887 26.20774 20.21365 31.09634 32.68004 30.78756 32.35799 31.72186 
     177      178      179      180      181      182      183      184 
31.72186 28.70034 22.30689 28.41278 19.08502 18.38564 25.94404 25.16887 
     185      186      187      188      189      190      191      192 
25.94404 28.99056 30.78756 30.17795 31.09634 24.17244 23.21845 26.20774 
     193      194 
24.66535 26.74308 

Example: Validation Set Approach (8/)

Calculate the MSE of the quadratic model

Code
# Validation-set estimate of the test MSE for the quadratic model
quad_errors <- auto_mpg_test_data$mpg - auto_mpg_test_pred_quad
mpg_mse_quad <- mean(quad_errors^2)
mpg_mse_quad
[1] 22.30821

Example: Validation Set Approach (9/)

Repeat the process for a cubic model

Code
# Cubic fit: adds I(horsepower^3) on top of the quadratic model
auto_mpg_lm_cubic <- lm(mpg ~ horsepower + I(horsepower^2) + I(horsepower^3), data = auto_mpg_train_data)
summary(auto_mpg_lm_cubic)

Call:
lm(formula = mpg ~ horsepower + I(horsepower^2) + I(horsepower^3), 
    data = auto_mpg_train_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-14.5616  -2.1686  -0.0387   2.2816  12.1906 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)      6.169e+01  6.106e+00  10.103  < 2e-16 ***
horsepower      -5.936e-01  1.596e-01  -3.719 0.000261 ***
I(horsepower^2)  2.211e-03  1.298e-03   1.703 0.090240 .  
I(horsepower^3) -2.272e-06  3.307e-06  -0.687 0.492869    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.017 on 194 degrees of freedom
Multiple R-squared:  0.7306,    Adjusted R-squared:  0.7264 
F-statistic: 175.3 on 3 and 194 DF,  p-value: < 2.2e-16

Example: Validation Set Approach (10/)

Predict the test data of the cubic model

Code
# Validation-set predictions from the cubic model
auto_mpg_test_pred_cubic <- predict(auto_mpg_lm_cubic, newdata = auto_mpg_test_data)
auto_mpg_test_pred_cubic
       1        2        3        4        5        6        7        8 
13.72427 14.16573 13.50298 14.72080 14.16573 23.29990 23.29990 22.83595 
       9       10       11       12       13       14       15       16 
25.02291 38.84003 25.28183 19.56152 24.51471 13.67405 13.22028 13.13625 
      17       18       19       20       21       22       23       24 
25.02291 24.51471 22.16324 21.10305 25.02291 13.72427 13.21651 20.11761 
      25       26       27       28       29       30       31       32 
29.56159 22.16324 25.54399 30.50895 33.54048 27.18566 35.72305 25.54399 
      33       34       35       36       37       38       39       40 
14.72080 14.48018 13.41775 14.33147 13.99987 14.72080 25.28183 30.50895 
      41       42       43       44       45       46       47       48 
25.54399 24.01930 22.83595 27.18566 25.02291 13.33430 13.18759 14.72080 
      49       50       51       52       53       54       55       56 
14.12569 13.67405 22.16324 22.16324 25.02291 23.29990 13.21651 23.53655 
      57       58       59       60       61       62       63       64 
24.51471 20.69999 15.16954 14.46322 24.26542 23.29990 22.16324 31.15791 
      65       66       67       68       69       70       71       72 
27.18566 28.64519 20.11761 21.10305 15.67940 14.72080 14.72080 28.64519 
      73       74       75       76       77       78       79       80 
23.29990 29.56159 13.50298 14.72080 14.89307 21.10305 20.11761 23.29990 
      81       82       83       84       85       86       87       88 
20.11761 17.02466 28.64519 26.35002 22.83595 24.51471 23.29990 22.60863 
      89       90       91       92       93       94       95       96 
19.20522 26.90380 24.01930 27.47085 26.35002 15.67940 14.72080 22.16324 
      97       98       99      100      101      102      103      104 
21.10305 26.90380 36.47991 36.09963 22.16324 20.11761 28.64519 14.72080 
     105      106      107      108      109      110      111      112 
15.16954 16.88932 30.83168 15.16954 21.10305 22.60863 13.21651 13.50298 
     113      114      115      116      117      118      119      120 
14.80574 24.76721 26.35002 31.15791 27.75938 22.83595 20.11761 20.11761 
     121      122      123      124      125      126      127      128 
38.03829 31.48764 36.47991 33.54048 20.11761 15.67940 24.51471 25.80940 
     129      130      131      132      133      134      135      136 
20.11761 15.16954 15.78886 15.67940 22.83595 23.29990 25.80940 22.83595 
     137      138      139      140      141      142      143      144 
19.20522 29.87392 25.80940 24.51471 16.88932 15.90085 16.25210 14.33147 
     145      146      147      148      149      150      151      152 
31.82089 28.05127 29.87392 24.51471 30.18970 30.18970 31.82089 24.51471 
     153      154      155      156      157      158      159      160 
19.20522 19.20522 24.51471 28.34654 33.54048 31.82089 24.51471 24.51471 
     161      162      163      164      165      166      167      168 
28.64519 24.01930 31.82089 38.03829 31.15791 31.15791 16.62658 22.16324 
     169      170      171      172      173      174      175      176 
25.02291 26.07807 20.11761 31.15791 32.84193 30.83168 32.49803 31.82089 
     177      178      179      180      181      182      183      184 
31.82089 28.64519 22.16324 28.34654 19.03136 18.36418 25.80940 25.02291 
     185      186      187      188      189      190      191      192 
25.80940 28.94724 30.83168 30.18970 31.15791 24.01930 23.06637 26.07807 
     193      194 
24.51471 26.62526 

Example: Validation Set Approach (11/)

Calculate the MSE of the cubic model

Code
# Validation-set estimate of the test MSE for the cubic model
cubic_errors <- auto_mpg_test_data$mpg - auto_mpg_test_pred_cubic
mpg_mse_cubic <- mean(cubic_errors^2)
mpg_mse_cubic
[1] 22.25234

Example: Validation Set Approach (12/)

Compare the MSE of the models

Code
# Collect the three validation-set MSE values into a small summary table
model_labels <- c("Linear", "Quadratic", "Cubic")
model_mses <- c(mpg_mse_lm, mpg_mse_quad, mpg_mse_cubic)

mpg_mse_df <- tibble(
  Model = model_labels,
  MSE = round(model_mses, 2)
)

gt::gt(mpg_mse_df)
Model MSE
Linear 27.07
Quadratic 22.31
Cubic 22.25

DRY (Don’t Repeat Yourself)

Example: Validation Set Approach (13/)

Use the poly() function and use a for loop

Code
# Fit polynomial models of increasing order and collect the validation MSE.
# Vectors are preallocated with a known length instead of being grown from
# c() inside the loop, and seq_len() replaces the magic literal 1:6.
n_orders <- 6
mpg_model_order <- seq_len(n_orders)
mpg_mse <- numeric(n_orders)

for (i in mpg_model_order) {
  fit_i <- lm(mpg ~ poly(horsepower, i), data = auto_mpg_train_data)
  pred_i <- predict(fit_i, newdata = auto_mpg_test_data)
  mpg_mse[i] <- mean((auto_mpg_test_data$mpg - pred_i)^2)
}

mpg_model_dry_df <- tibble(
  model_order = mpg_model_order,
  mpg_mse
)

mpg_model_dry_df

Example: Validation Set Approach (14/)

Plot the MSE values

Code
# Validation-set MSE as a function of polynomial order.
# A single labs() call (the original had two; the second silently replaced
# the first) and the "Validaition" typo in the title is fixed.
mpg_model_dry_df |> 
  ggplot(aes(x = model_order, y = mpg_mse)) +
  geom_point() +
  geom_line() +
  labs(
    title = "Validation Set Approach to Model Validation",
    subtitle = "Data split 50/50",
    x = "Model Order",
    y = "MSE",
    caption = "Source: Auto MPG dataset from UCI Machine Learning Repository"
  ) +
  scale_x_continuous(breaks = 1:6) +
  theme_light()

Example: Validation Set Approach (15/)

How does MSE vary with our choice of the “random seed” used to split the data?

Code
# Validation-set MSE of the linear mpg ~ horsepower model for each seed.
# Each seed drives one 50/50 stratified split of auto_mpg_9_final; returns
# a tibble with one (Seed, MSE) row per seed, MSE rounded to 3 decimals.
mse_seed <- function(seed_values) {
  mse_values <- vapply(seed_values, function(seed) {
    set.seed(seed)
    
    split_idx <- caret::createDataPartition(auto_mpg_9_final$mpg, p = 0.5, list = FALSE)
    train_half <- auto_mpg_9_final[split_idx,]
    test_half <- auto_mpg_9_final[-split_idx,]
    
    fit <- lm(mpg ~ horsepower, data = train_half)
    pred <- predict(fit, newdata = test_half)
    
    mean((test_half$mpg - pred)^2)
  }, numeric(1))
  
  tibble::tibble(Seed = seed_values, MSE = round(mse_values, 3))
}

# Example usage: seed 42 plus nine seeds drawn at random
# (the sample() draw itself depends on the current RNG state)
seed_values <- c(42, sample(1:9999, 9))
result <- mse_seed(seed_values)
result

Example: Validation Set Approach (16/)

Calculate the MSE values for different seeds and model orders

Code
# Validation-set MSE across several random 50/50 splits (one per seed) and
# polynomial degrees 1..max_degree. Returns one row per (Seed, Degree) pair.
#
# Fix: results are collected in a preallocated list and bound once at the
# end, instead of growing a data frame with rbind() inside the loop (which
# re-copies the whole accumulated result on every iteration). The local
# `formula` variable is also renamed so it no longer shadows stats::formula.
mse_seed_poly <- function(seed_values, max_degree = 3) {
  rows <- vector("list", length(seed_values) * max_degree)
  k <- 0L
  
  for (seed in seed_values) {
    set.seed(seed)
    
    train_idx <- caret::createDataPartition(auto_mpg_9_final$mpg, p = 0.5, list = FALSE)
    train_data <- auto_mpg_9_final[train_idx,]
    test_data <- auto_mpg_9_final[-train_idx,]
    
    for (degree in seq_len(max_degree)) {
      model_formula <- as.formula(paste("mpg ~ poly(horsepower, degree =", degree, ")"))
      fit <- lm(model_formula, data = train_data)
      pred <- predict(fit, newdata = test_data)
      
      k <- k + 1L
      rows[[k]] <- data.frame(
        Seed = seed,
        Degree = degree,
        MSE = mean((test_data$mpg - pred)^2)
      )
    }
  }
  
  bind_rows(rows)
}

# Example usage: 11 seeds (42 plus ten random draws), degrees 1 through 6
set.seed(42)  # Set a seed for reproducibility
seed_values <- c(42, sample(1:9999, 10))
result_df <- mse_seed_poly(seed_values, max_degree = 6)

result_df

Plot the MSE values for different seeds

Code
# MSE vs. degree, one line per random seed/split.
# A single labs() call (the original had two; the second silently replaced
# the first) and the "Validaition" typo in the title is fixed.
result_df |> 
  ggplot(aes(x = Degree, y = MSE, color = factor(Seed))) +
  geom_line(show.legend = FALSE) +
  labs(
    title = "Validation Set Approach to Model Validation",
    subtitle = "Data split 50/50; 11 random seeds",
    x = "Model Order",
    y = "MSE",
    caption = "Source: Auto MPG dataset from UCI Machine Learning Repository"
  ) +
  scale_x_continuous(breaks = 1:6) +
  theme_light()

Example: Validation Set Approach (17/)

Mean values for the MSE

Code
# Average the MSE over all seeds, one mean per polynomial degree
result_df |> 
  group_by(Degree) |> 
  summarise(MSE = mean(MSE))

Example: Validation Set Approach (18/)

Plot the predicted values versus the actual values for order 2

Code
# Refit the degree-2 model and overlay its predictions (red) on the test data.
# A single labs() call: the original chunk had two, and the second silently
# replaced every label set by the first.
auto_mpg_lm <- lm(mpg ~ poly(horsepower, 2), data = auto_mpg_train_data)
auto_mpg_test_pred <- predict(auto_mpg_lm, newdata = auto_mpg_test_data)

auto_mpg_test_data |> 
  ggplot(aes(x = horsepower, y = mpg)) +
  geom_point() +
  geom_line(aes(y = auto_mpg_test_pred), color = "red") +
  labs(
    title = "Validation Set Approach",
    subtitle = "Data split 50/50; Model Order 2",
    x = "Horsepower",
    y = "MPG",
    caption = "Source: Auto MPG dataset from UCI Machine Learning Repository"
  ) +
  theme_light()

Example: Validation Set Approach (19/)

What about a Poisson model?

Code
# NOTE(review): the Poisson fit is left commented out in the source —
# presumably because mpg is continuous rather than a count; confirm intent
# auto_mpg_poisson <- glm(mpg ~ horsepower, data = auto_mpg_train_data, family = "poisson")
# auto_mpg_test_pred_poisson <- predict(auto_mpg_poisson, newdata = auto_mpg_test_data, type = "response")

# Gamma GLM with a log link; type = "response" returns predictions
# back-transformed to the mpg scale
auto_mpg_gamma <- glm(mpg ~ horsepower, data = auto_mpg_train_data, family = Gamma(link = "log"))
auto_mpg_test_pred_gamma <- predict(auto_mpg_gamma, newdata = auto_mpg_test_data, type = "response")

# mpg_mse_poisson <- mean((auto_mpg_test_data$mpg - auto_mpg_test_pred_poisson)^2)
# mpg_mse_poisson

# Validation-set MSE for the gamma model
mpg_mse_gamma <- mean((auto_mpg_test_data$mpg - auto_mpg_test_pred_gamma)^2)
mpg_mse_gamma
[1] 23.80739

Example: Validation Set Approach (20/)

Plot the predicted values versus the actual values for the gamma model (the Poisson fit is commented out above)

Code
# Overlay the gamma-model predictions (blue) on the test data.
# A single labs() call (the original had two; the second silently replaced
# the first) and the "Validaition" typo in the title is fixed.
auto_mpg_test_data |> 
  ggplot(aes(x = horsepower, y = mpg)) +
  geom_point() +
  geom_line(aes(y = auto_mpg_test_pred_gamma), color = "blue") +
  labs(
    title = "Validation Set Approach to Model Validation",
    subtitle = "Data split 50/50; Gamma (blue line) model",
    x = "Horsepower",
    y = "MPG",
    caption = "Source: Auto MPG dataset from UCI Machine Learning Repository"
  ) +
  theme_light()

Leave-One-Out Cross Validation (LOOCV)

Leave-One-Out Cross Validation (LOOCV) All Data (n observations) 1 Iteration 1 2 Iteration 2 3 Iteration 3 ... n Training Set Validation Set (1 observation)

Example: Leave-One-Out Cross Validation (LOOCV) (1/)

Fit a linear model using glm()

Code
# Same linear model, fit via glm() (default gaussian family) so the fitted
# object can be passed to boot::cv.glm() later
auto_mpg_glm <- glm(mpg ~ horsepower, data = auto_mpg_9_final)
summary(auto_mpg_glm)

Call:
glm(formula = mpg ~ horsepower, data = auto_mpg_9_final)

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 39.935861   0.717499   55.66   <2e-16 ***
horsepower  -0.157845   0.006446  -24.49   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 24.06645)

    Null deviance: 23819.0  on 391  degrees of freedom
Residual deviance:  9385.9  on 390  degrees of freedom
AIC: 2363.3

Number of Fisher Scoring iterations: 2

Fit a linear model using lm()

Code
# Same model via lm() for comparison; the coefficient estimates match the
# glm() fit above exactly
auto_mpg_lm <- lm(mpg ~ horsepower, data = auto_mpg_9_final)
summary(auto_mpg_lm)

Call:
lm(formula = mpg ~ horsepower, data = auto_mpg_9_final)

Residuals:
     Min       1Q   Median       3Q      Max 
-13.5710  -3.2592  -0.3435   2.7630  16.9240 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 39.935861   0.717499   55.66   <2e-16 ***
horsepower  -0.157845   0.006446  -24.49   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.906 on 390 degrees of freedom
Multiple R-squared:  0.6059,    Adjusted R-squared:  0.6049 
F-statistic: 599.7 on 1 and 390 DF,  p-value: < 2.2e-16

We perform linear regression using the glm() function rather than the lm() function because the former can be used together with cv.glm(). The cv.glm() function is part of the boot library (functions for bootstrapping).

Example: Leave-One-Out Cross Validation (LOOCV) (2/)

Perform LOOCV using the cv.glm() function

Code
library(boot)

# K = n makes cv.glm() perform leave-one-out CV;
# delta = c(raw CV estimate, bias-corrected estimate)
auto_mpg_cv_glm <- cv.glm(auto_mpg_9_final, auto_mpg_glm, K = nrow(auto_mpg_9_final))
auto_mpg_cv_glm$delta
[1] 24.23151 24.23114


LOOCV estimate for the test MSE

\[ CV_{(n)} = \frac{1}{n} \sum_{i=1}^{n} \text{MSE}_i \]

Example: Leave-One-Out Cross Validation (LOOCV) (3/)

LOOCV is a general method and can be used with any kind of predictive modeling.


Advantages and Disadvantages of LOOCV

Advantages

  • Less bias, as the method uses n−1 observations to fit the model.
  • No randomness in the data split (unlike the validation approach).

Disadvantages

  • Computationally expensive for large datasets.

Example: Leave-One-Out Cross Validation (LOOCV) (4/)

Perform LOOCV over a range of polynomial degrees

Code
# Create a function to perform LOOCV over a range of polynomial degrees
# LOOCV (K = n) test-MSE estimates for polynomial models of degree
# 1..max_degree; returns a tibble with one row per degree.
#
# Fix: one tibble per degree is collected in a preallocated list and bound
# once at the end, instead of growing `results` with bind_rows() inside the
# loop. The local `formula` variable is renamed to avoid shadowing
# stats::formula.
auto_mpg_cv_glm_poly_loocv <- function(max_degree = 3) {
  per_degree <- vector("list", max_degree)
  
  for (degree in seq_len(max_degree)) {
    model_formula <- as.formula(paste("mpg ~ poly(horsepower, degree =", degree, ")"))
    fit <- glm(model_formula, data = auto_mpg_9_final)
    
    cv_results <- cv.glm(auto_mpg_9_final, fit, K = nrow(auto_mpg_9_final))
    
    # delta[1] is the raw cross-validation estimate of prediction error
    per_degree[[degree]] <- tibble(Degree = degree, MSE = cv_results$delta[1])
  }
  
  bind_rows(per_degree)
}

# Example usage: LOOCV for polynomial degrees 1 through 10
result_df <- auto_mpg_cv_glm_poly_loocv(10)
result_df |> gt::gt()  
Degree MSE
1 24.23151
2 19.24821
3 19.33498
4 19.42443
5 19.03321
6 18.97864
7 18.83305
8 18.96115
9 19.06863
10 19.49093

k-Fold Cross-Validation

K-Fold Cross Validation All Data (n observations) 1 Iteration 1 2 Iteration 2 3 Iteration 3 ... k Iteration k Training Set Validation Set (1/k of data)

Example: k-Fold Cross-Validation (1/)

Perform k-Fold Cross-Validation using the cv.glm() function

Code
# 10-fold CV; delta = c(raw CV estimate, bias-corrected estimate)
auto_mpg_cv_glm_kfold <- cv.glm(auto_mpg_9_final, auto_mpg_glm, K = 10)
auto_mpg_cv_glm_kfold$delta
[1] 24.30505 24.28609


k-Fold estimate for the test MSE

\[ CV_{(k)} = \frac{1}{k} \sum_{i=1}^{k} \text{MSE}_i \]

Example: k-Fold Cross-Validation (2/)

Perform k-Fold Cross-Validation over a range of polynomial degrees

Code
# Create a function to perform k-fold CV over a range of polynomial degrees
# (the original comment said LOOCV, but this uses K folds).
#
# Fix: one tibble per degree is collected in a preallocated list and bound
# once at the end, instead of growing `results` with bind_rows() inside the
# loop. The local `formula` variable is renamed to avoid shadowing
# stats::formula.
auto_mpg_cv_glm_poly_kfold <- function(max_degree = 3, K = 10) {
  per_degree <- vector("list", max_degree)
  
  for (degree in seq_len(max_degree)) {
    model_formula <- as.formula(paste("mpg ~ poly(horsepower, degree =", degree, ")"))
    fit <- glm(model_formula, data = auto_mpg_9_final)
    
    cv_results <- cv.glm(auto_mpg_9_final, fit, K = K)
    
    # delta[1] is the raw cross-validation estimate of prediction error
    per_degree[[degree]] <- tibble(Degree = degree, MSE = cv_results$delta[1])
  }
  
  bind_rows(per_degree)
}

# Example usage: 10-fold CV for polynomial degrees 1 through 10
result_df <- auto_mpg_cv_glm_poly_kfold(10, 10)
result_df |> gt::gt()  
Degree MSE
1 24.13767
2 19.25900
3 19.36803
4 19.38186
5 19.07045
6 18.89805
7 19.62119
8 18.87379
9 19.00811
10 19.75573

Example: k-Fold Cross-Validation (3/)

Compare the time for the LOOCV and k-fold CV methods

Code
# Time taken for LOOCV (refits n = 392 models per degree, so much slower
# than the k-fold timing below)
system.time(auto_mpg_cv_glm_poly_loocv(10))
   user  system elapsed 
  5.211   0.162   5.375 


Code
# Time taken for k-Fold CV (only K = 10 refits per degree)
system.time(auto_mpg_cv_glm_poly_kfold(10,10))
   user  system elapsed 
  0.217   0.003   0.220 

Bias-Variance Trade-Off

The bias-variance trade-off is a fundamental concept in machine learning that deals with the balance between a model’s ability to fit the training data (low bias) and its ability to generalize to new, unseen data (low variance).

Cross-validation helps us understand and manage this trade-off.

In cross-validation:

  1. We split our data into training and validation sets multiple times.
  2. For each split, we train our model on the training set and evaluate it on the validation set.
  3. We average the performance across all splits.

Bias:

  • Low bias: Model fits training data well but may overfit.
  • High bias: Model is too simple and underfits both training and validation data.

Variance:

  • Low variance: Model performs consistently across different validation sets.
  • High variance: Model’s performance varies significantly between validation sets.

As we adjust model complexity:

  1. Simple models:
    • High bias, low variance
    • Underfits training data
    • Similar (but poor) performance across validation sets
  2. Complex models:
    • Low bias, high variance
    • Fits training data well
    • Performance varies widely across validation sets

The goal is to find the sweet spot where the model complexity balances bias and variance, minimizing overall error.

Cross-validation helps us identify this balance by:

  1. Revealing overfitting: If performance is much better on training than validation sets
  2. Showing underfitting: If performance is poor on both training and validation sets
  3. Indicating generalization: Consistent performance across validation sets suggests good balance

The Bootstrap

Introduction to the Bootstrap

Bootstrap Visualization

Example: The Bootstrap (1/)

Simulated Data of 100 pairs of returns.

Code
# Simulated returns for two investments, X and Y (100 pairs, from ISLR2)
portfolio <- ISLR2::Portfolio
portfolio
Code
# Left panel: both return series plotted against the observation index
returns_over_time <- portfolio |> 
  ggplot() +
  geom_line(aes(x = seq_along(X), y = X)) +
  geom_line(aes(x = seq_along(Y), y = Y), color = "red") +
  annotate("text", x = 85, y = -2.5, label = "Investment X", color = "black") +
  annotate("text", x = 85, y = -2.75, label = "Investment Y", color = "red") +
  labs(
    title = "Simulated Portfolio Data",
    x = "Sequence",
    y = "Change in Returns"
  ) +
  theme_light()

# Right panel: joint distribution of the two returns
returns_scatter <- portfolio |> 
  ggplot() +
  geom_point(aes(x = X, y = Y)) +
  labs(
    title = "Scatterplot of Simulated Portfolio Data",
    x = "X",
    y = "Y"
  ) +
  theme_light()

# patchwork: place the two panels side by side
returns_over_time + returns_scatter

Example: The Bootstrap (2/)

. . . we wish to invest a fixed sum of money in two financial assets that yield returns of \(X\) and \(Y\), respectively, where \(X\) and \(Y\) are random quantities. We will invest a fraction \(\alpha\) of our money in \(X\) and will invest the remaining \(1-\alpha\) in \(Y\) . . . we wish to choose \(\alpha\) to minimize the total risk, or variance of our investment.

\[ \alpha = \frac{\sigma_Y^2 - \sigma_{XY}}{\sigma_X^2 + \sigma_Y^2 - 2\sigma_{XY}} \]

where \(\sigma_X^2\) is the variance of \(X\), \(\sigma_Y^2\) is the variance of \(Y\), and \(\sigma_{XY}\) is the covariance between \(X\) and \(Y\).

(We don’t know these values, but can estimate them from the data, so in reality, we have \(\hat{\alpha}\).)

Example: The Bootstrap (3/)

Code
# Toy data frame used to demonstrate stacking below
test_tibble <- tibble(
  x = 1:10,
  y = (1:10)^2
)

test_tibble
Code
# Load required library
# NOTE(review): tidyverse is already attached at the top of the document;
# repeated here (with the same toy tibble as the previous chunk) so this
# chunk is self-contained
library(tidyverse)

# Your original data frame
test_tibble <- tibble(
  x = 1:10,
  y = (1:10)^2
)

# Function to stack the data frame n times, tagging each copy with its
# repeat number in a `repeat_number` column.
#
# Fix: seq_len(n_repeats) instead of 1:n_repeats — for n_repeats = 0,
# 1:0 yields c(1, 0) and would produce two spurious copies, while
# seq_len(0) correctly yields an empty result.
stack_dataframe <- function(df, n_repeats) {
  map(seq_len(n_repeats), \(repeat_num) {
    df |> 
      mutate(repeat_number = repeat_num)
    }) |> 
    list_rbind()
}

# Create the new stacked data frame (e.g., repeating 5 times)
stacked_df <- stack_dataframe(test_tibble, 5)

# View the result: 50 rows = 10 rows x 5 repeats, plus repeat_number
print(stacked_df)
# A tibble: 50 × 3
       x     y repeat_number
   <int> <dbl>         <int>
 1     1     1             1
 2     2     4             1
 3     3     9             1
 4     4    16             1
 5     5    25             1
 6     6    36             1
 7     7    49             1
 8     8    64             1
 9     9    81             1
10    10   100             1
# ℹ 40 more rows

End of Module 8

References

Quinlan, R. 1993. “Auto MPG.” UCI Machine Learning Repository.